Overview¶

Objective:¶

  • Through Exploratory Data Analysis, can we characterize these shooting incidents and reach preliminary conclusions about the high- and low-risk times and locations for shootings in New York City?
In [2]:
import pandas as pd
import numpy as np
import altair as alt
import folium
from folium.plugins import MarkerCluster, HeatMap

# Source: NYPD Shooting Incident Data (Historic), NYC Open Data
DATA_URL = "https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD"

# Load the raw incident records
df = pd.read_csv(DATA_URL)

# Inspect dtypes / null counts, then preview the first few rows
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28562 entries, 0 to 28561
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   INCIDENT_KEY             28562 non-null  int64  
 1   OCCUR_DATE               28562 non-null  object 
 2   OCCUR_TIME               28562 non-null  object 
 3   BORO                     28562 non-null  object 
 4   LOC_OF_OCCUR_DESC        2966 non-null   object 
 5   PRECINCT                 28562 non-null  int64  
 6   JURISDICTION_CODE        28560 non-null  float64
 7   LOC_CLASSFCTN_DESC       2966 non-null   object 
 8   LOCATION_DESC            13585 non-null  object 
 9   STATISTICAL_MURDER_FLAG  28562 non-null  bool   
 10  PERP_AGE_GROUP           19218 non-null  object 
 11  PERP_SEX                 19252 non-null  object 
 12  PERP_RACE                19252 non-null  object 
 13  VIC_AGE_GROUP            28562 non-null  object 
 14  VIC_SEX                  28562 non-null  object 
 15  VIC_RACE                 28562 non-null  object 
 16  X_COORD_CD               28562 non-null  float64
 17  Y_COORD_CD               28562 non-null  float64
 18  Latitude                 28503 non-null  float64
 19  Longitude                28503 non-null  float64
 20  Lon_Lat                  28503 non-null  object 
dtypes: bool(1), float64(5), int64(2), object(13)
memory usage: 4.4+ MB
Out[2]:
INCIDENT_KEY OCCUR_DATE OCCUR_TIME BORO LOC_OF_OCCUR_DESC PRECINCT JURISDICTION_CODE LOC_CLASSFCTN_DESC LOCATION_DESC STATISTICAL_MURDER_FLAG ... PERP_SEX PERP_RACE VIC_AGE_GROUP VIC_SEX VIC_RACE X_COORD_CD Y_COORD_CD Latitude Longitude Lon_Lat
0 244608249 05/05/2022 00:10:00 MANHATTAN INSIDE 14 0.0 COMMERCIAL VIDEO STORE True ... M BLACK 25-44 M BLACK 986050.000 214231.000000 40.754692 -73.993500 POINT (-73.9935 40.754692)
1 247542571 07/04/2022 22:20:00 BRONX OUTSIDE 48 0.0 STREET (null) True ... (null) (null) 18-24 M BLACK 1016802.000 250581.000000 40.854402 -73.882330 POINT (-73.88233 40.854402)
2 84967535 05/27/2012 19:35:00 QUEENS NaN 103 0.0 NaN NaN False ... NaN NaN 18-24 M BLACK 1048632.000 198262.000000 40.710634 -73.767773 POINT (-73.76777349199995 40.71063412500007)
3 202853370 09/24/2019 21:00:00 BRONX NaN 42 0.0 NaN NaN False ... M UNKNOWN 25-44 M BLACK 1014493.000 242565.000000 40.832417 -73.890714 POINT (-73.89071440599997 40.832416753000075)
4 27078636 02/25/2007 21:00:00 BROOKLYN NaN 83 0.0 NaN NaN False ... M BLACK 25-44 M BLACK 1009149.375 190104.703125 40.688443 -73.910219 POINT (-73.91021857399994 40.68844345900004)

5 rows × 21 columns

In [3]:
# Correcting data types and extracting valuable info.
# OCCUR_DATE is a US-style "MM/DD/YYYY" string (see df.head() above); pass an
# explicit format so pandas never has to guess — guessing is slow and can
# silently mis-parse ambiguous dates.
df["OCCUR_DATE"] = pd.to_datetime(df["OCCUR_DATE"], format="%m/%d/%Y")

# OCCUR_TIME is a zero-padded "HH:MM:SS" string; its first two characters
# are the hour of day.
df["Hour"] = df["OCCUR_TIME"].str[:2].astype(int)
df["DayOfWeek"] = df["OCCUR_DATE"].dt.day_name()
df["Month"] = df["OCCUR_DATE"].dt.month

# Hour fully captures what this analysis needs from OCCUR_TIME; drop it
# without `inplace=True` so the assignment is explicit on re-runs.
df = df.drop(columns=["OCCUR_TIME"])

# Bin Month into Season: month % 12 // 3 maps Dec/Jan/Feb -> 0 (Winter),
# Mar-May -> 1 (Spring), Jun-Aug -> 2 (Summer), Sep-Nov -> 3 (Fall).
season_mapping = {0: 'Winter', 1: 'Spring', 2: 'Summer', 3: 'Fall'}
df["Season"] = (df["Month"] % 12 // 3).map(season_mapping)

# Bin Hour into TimeofDay. "Night" is used for both late evening (21-24) and
# early morning (0-4), so the labels contain a duplicate and the categorical
# must be unordered (ordered=False) for pandas to accept it.
df["TimeofDay"] = pd.cut(
    df["Hour"],
    bins=[-1, 4, 11, 16, 20, 24],
    labels=["Night", "Morning", "Afternoon", "Evening", "Night"],
    right=True,
    ordered=False
)
In [4]:
# Encode the murder flag as 0/1 so the axis labels below read naturally
df["STATISTICAL_MURDER_FLAG"] = df["STATISTICAL_MURDER_FLAG"].astype(int)

# Murder to Non-Murder Counts and Ratio
murder_counts = df['STATISTICAL_MURDER_FLAG'].value_counts().reset_index()
murder_counts.columns = ['MurderFlag', 'Count']
murder_counts['Proportion'] = murder_counts['Count'] / murder_counts['Count'].sum()

# Shared x-encoding and tooltips reused by both layers below
base_chart = alt.Chart(murder_counts).encode(
    x=alt.X('MurderFlag:N', title='Murder (1) or Non-Murder (0)'),
    tooltip=[
        alt.Tooltip('MurderFlag:N', title='MurderFlag'),
        alt.Tooltip('Count:Q', title='Incident Count'),
        alt.Tooltip('Proportion:Q', format='.2%', title='Proportion of Total Incidents')
    ]
)

# Bar chart of raw counts
bar_chart = base_chart.mark_bar().encode(
    y=alt.Y('Count:Q', title='Incident Count', axis=alt.Axis(titleColor='blue')),
    color=alt.Color('MurderFlag:N', scale=alt.Scale(scheme='category10'), legend=None)
)

# Invisible line layer whose only purpose is to contribute a second
# (proportion) y-axis after resolve_scale below
line_chart = base_chart.mark_line(opacity=0).encode(
    y=alt.Y('Proportion:Q', title='Proportion of Total Incidents (%)',
            axis=alt.Axis(format='.0%', titleColor='orange'))
)

# Combine everything.
# FIX: the original rebound `murder_counts` (the DataFrame) to this chart
# object, shadowing the data with a different kind of value; the chart gets
# its own name so the counts stay available.
murder_chart = alt.layer(bar_chart, line_chart).resolve_scale(
    y='independent'
).properties(
    title='Murder vs. Non-Murder Incidents',
    width=400,
    height=300
)

murder_chart
Out[4]:
In [5]:
# Count Per Season and Boro
season_borough_counts = df.groupby(["Season", "BORO"]).size().reset_index(name="Count")

# FIX: the original assigned every borough row its *season's* total divided
# by the overall total, so all boroughs within a season showed the identical
# proportion and the per-borough tooltip was meaningless. Show each
# borough's share of its season's incidents instead.
season_totals = season_borough_counts.groupby("Season")["Count"].transform("sum")
season_borough_counts["Proportion"] = season_borough_counts["Count"] / season_totals

bar_chart = alt.Chart(season_borough_counts).mark_bar().encode(
    x=alt.X("Season:N", title="Season", sort=['Winter', 'Spring', 'Summer', 'Fall']),
    y=alt.Y("Count:Q", title="Incident Count", axis=alt.Axis(titleColor='blue')),
    color=alt.Color("BORO:N", legend=alt.Legend(title="Borough")),
    tooltip=[
        alt.Tooltip("Season:N", title="Season"),
        alt.Tooltip("BORO:N", title="Borough"),
        alt.Tooltip("Count:Q", title="Incident Count"),
        alt.Tooltip("Proportion:Q", format=".2%", title="Share of Season Total")
    ]
)

season_borough_chart = bar_chart.properties(
    title="Seasonal Trends in Incident Counts by Borough",
    width=800,
    height=400
)

season_borough_chart
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[5]:
In [6]:
# Month of occurrence as a timestamp at the start of that calendar month.
# FIX: the original overwrote the integer `Month` column created in the
# feature-engineering cell, silently changing its meaning and making the
# notebook order-sensitive; write a separate column instead.
df["MonthStart"] = df["OCCUR_DATE"].dt.to_period("M").dt.start_time

# Incident Count by Month and Season
monthly_counts = df.groupby(["MonthStart", "Season"]).size().reset_index(name="Count")

# Dates of Notable events to annotate on the timeline
event_dates = pd.DataFrame({
    "EventDate": pd.to_datetime(["2020-03-01", "2020-06-01", "2021-01-06"]),
    "EventDescription": ["COVID-19 Pandemic Start", "George Floyd Protests", "Capitol Riots"]
})

# Base line chart of monthly incident counts
monthly_chart = alt.Chart(monthly_counts).mark_line().encode(
    x=alt.X("MonthStart:T", title="Month"),
    y=alt.Y("Count:Q", title="Incident Count"),
    tooltip=["MonthStart:T", "Count:Q"]
).properties(
    title="Incident Frequency Over Time (Monthly Aggregation)",
    width=800,
    height=400
)

# Dashed red rules marking the notable events
event_lines = alt.Chart(event_dates).mark_rule(color="red", strokeDash=[5, 3]).encode(
    x="EventDate:T",
    tooltip=["EventDate:T", "EventDescription:N"]
)

# Translucent area shaded by season to highlight seasonality
season_colors = alt.Chart(monthly_counts).mark_area(opacity=0.2).encode(
    x="MonthStart:T",
    y="Count:Q",
    color=alt.Color("Season:N", scale=alt.Scale(domain=["Winter", "Spring", "Summer", "Fall"],
                                                range=["#377eb8", "#4daf4a", "#ffcc00", "#a65628"])),
    tooltip=["MonthStart:T", "Season:N"]
)

# Combine the layers
monthly_incident_chart = (monthly_chart + season_colors + event_lines).interactive()

monthly_incident_chart
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[6]:
In [7]:
# Tally incidents per hour of day, split into murder vs non-murder
time_of_day_counts = (
    df.groupby(['Hour', 'STATISTICAL_MURDER_FLAG'])
      .size()
      .reset_index(name='Count')
)

# Replace the 0/1 flag with readable labels for the legend and tooltips
time_of_day_counts['STATISTICAL_MURDER_FLAG'] = (
    time_of_day_counts['STATISTICAL_MURDER_FLAG'].replace({0: 'Non-Murder', 1: 'Murder'})
)

# Stacked area chart: incident volume across the day, colored by incident type
time_of_day_murdernonmurder_chart = (
    alt.Chart(time_of_day_counts)
    .mark_area(opacity=0.7)
    .encode(
        x=alt.X('Hour:O', title='Hour of Day'),
        y=alt.Y('Count:Q', title='Incident Count'),
        color=alt.Color('STATISTICAL_MURDER_FLAG:N', legend=alt.Legend(title="Incident Type")),
        tooltip=['Hour:O', 'STATISTICAL_MURDER_FLAG:N', 'Count:Q'],
    )
    .properties(
        title='Time of Day Patterns by Incident Type',
        width=700,
        height=400,
    )
)

time_of_day_murdernonmurder_chart
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[7]:
In [8]:
# Tally incidents for every (hour-of-day, weekday) combination
hour_day_counts = df.groupby(["Hour", "DayOfWeek"]).size().reset_index(name="Count")

# Explicit weekday ordering for the y-axis (groupby output is alphabetical)
day_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Hover details shared by every cell of the heatmap
hover_fields = [
    alt.Tooltip("Hour:O", title="Hour of Day"),
    alt.Tooltip("DayOfWeek:O", title="Day of Week"),
    alt.Tooltip("Count:Q", title="Incident Count"),
]

# Heatmap: brighter cells mark hour/weekday combinations with more incidents
heatmap = (
    alt.Chart(hour_day_counts)
    .mark_rect()
    .encode(
        x=alt.X("Hour:O", title="Hour of Day"),
        y=alt.Y("DayOfWeek:O", sort=day_order, title="Day of Week"),
        color=alt.Color("Count:Q", scale=alt.Scale(scheme="viridis"), title="Incident Count"),
        tooltip=hover_fields,
    )
)

heatmap.properties(
    title="Incident Frequency by Hour and Day of Week",
    width=800,
    height=400,
)
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[8]:
In [9]:
# Victim age bins in display order; any other label collapses into "Other"
included_age_groups = ["<18", "18-24", "25-44", "45-64", "65+"]
age_order = included_age_groups + ["Other"]
df["AGE_GROUP"] = df["VIC_AGE_GROUP"].where(
    df["VIC_AGE_GROUP"].isin(included_age_groups), "Other"
)

# Incident count and share per age bucket
age_group_counts = df["AGE_GROUP"].value_counts().reset_index()
age_group_counts.columns = ["Age Group", "Count"]
age_group_counts["Proportion"] = age_group_counts["Count"] / age_group_counts["Count"].sum()

# Ordered categorical so the buckets sort young-to-old rather than by count
age_group_counts["Age Group"] = pd.Categorical(
    age_group_counts["Age Group"], categories=age_order, ordered=True
)

# Bar chart of incidents per victim age group
age_group_chart = alt.Chart(age_group_counts).mark_bar().encode(
    x=alt.X("Age Group:N", title="Age Group", sort=age_order),
    y=alt.Y("Count:Q", title="Incident Count", axis=alt.Axis(titleColor="blue")),
    color=alt.Color("Age Group:N", legend=None),
    tooltip=[
        alt.Tooltip("Age Group:N", title="Age Group"),
        alt.Tooltip("Count:Q", title="Incident Count"),
        alt.Tooltip("Proportion:Q", format=".2%", title="Proportion of Total Incidents"),
    ],
)

age_group_chart.properties(
    title="Incident Distribution by Age Group",
    width=800,
    height=400,
)
Out[9]:
In [10]:
# Victim race categories kept as-is; everything else collapses into "Other"
included_races = ["BLACK", "BLACK HISPANIC", "WHITE", "WHITE HISPANIC", "ASIAN/PACIFIC ISLANDER"]
df["RACE_GROUP"] = df["VIC_RACE"].where(df["VIC_RACE"].isin(included_races), "Other")

# Incident count and share per race group
race_group_counts = df["RACE_GROUP"].value_counts().reset_index()
race_group_counts.columns = ["Race", "Count"]
race_group_counts["Proportion"] = race_group_counts["Count"] / race_group_counts["Count"].sum()

# Order bars from most to least frequent
race_group_counts = race_group_counts.sort_values("Count", ascending=False)
sorted_races = race_group_counts["Race"].tolist()

# Bar chart of incidents per victim race group
race_group_chart = alt.Chart(race_group_counts).mark_bar().encode(
    x=alt.X("Race:N", title="Race", sort=sorted_races),
    y=alt.Y("Count:Q", title="Incident Count", axis=alt.Axis(titleColor="blue")),
    color=alt.Color("Race:N", legend=None),
    tooltip=[
        alt.Tooltip("Race:N", title="Race"),
        alt.Tooltip("Count:Q", title="Incident Count"),
        alt.Tooltip("Proportion:Q", format=".2%", title="Proportion of Total Incidents"),
    ],
)

race_group_chart.properties(
    title="Incident Distribution by Race (Ordered by Count)",
    width=800,
    height=400,
)
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[10]:
In [11]:
# Value-count the location description column
location_desc_counts = df["LOCATION_DESC"].value_counts().reset_index()
location_desc_counts.columns = ["Location Description", "Count"]

# Collapse null-ish labels into "Unknown" and rare categories (< threshold
# incidents) into "Other".
# FIX: the original looked each label back up with `.loc` inside an `apply`,
# an accidental O(n^2) pattern; the Count column is already row-aligned, so
# vectorized masks via np.select do the same thing in O(n). The conditions
# are evaluated in order, so null-ish labels become "Unknown" even when
# their count is below the threshold, matching the original precedence.
threshold = 100
labels = location_desc_counts["Location Description"]
location_desc_counts["Location Description"] = np.select(
    [labels.isin(["(null)", "NONE"]), location_desc_counts["Count"] < threshold],
    ["Unknown", "Other"],
    default=labels,
)

# Recalculate counts and proportions after grouping
location_desc_counts = location_desc_counts.groupby("Location Description").agg({"Count": "sum"}).reset_index()
location_desc_counts["Proportion"] = location_desc_counts["Count"] / location_desc_counts["Count"].sum()

# Horizontal bar chart, longest bar on top
loc_desc_chart = alt.Chart(location_desc_counts).mark_bar().encode(
    x=alt.X("Count:Q", title="Count"),
    y=alt.Y("Location Description:N", title="Location Description", sort="-x"),
    color=alt.Color("Location Description:N", title="Category", legend=alt.Legend(title="Location Description")),
    tooltip=[
        alt.Tooltip("Location Description:N", title="Location Description"),
        alt.Tooltip("Count:Q", title="Count"),
        alt.Tooltip("Proportion:Q", format=".2%", title="Proportion of Total Incidents")
    ]
)

loc_desc_chart.properties(
    title="Distribution of Location Descriptions (Binned with Unknown and Other Categories)",
    width=800,
    height=400
)
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[11]:
In [12]:
# Value-count the Location Classification Description column
loc_class_counts = df["LOC_CLASSFCTN_DESC"].value_counts().reset_index()
loc_class_counts.columns = ["Location Classification Description", "Count"]

# Group rare classifications (< threshold incidents) into "Other".
# FIX: the original re-looked-up each label with `.loc` inside an `apply`,
# an accidental O(n^2) pattern; the Count column is already row-aligned, so
# a single vectorized np.where is equivalent and O(n).
threshold = 100
loc_class_counts["Location Classification Description"] = np.where(
    loc_class_counts["Count"] < threshold,
    "Other",
    loc_class_counts["Location Classification Description"],
)

# Recalculate counts and proportions after grouping
loc_class_counts = loc_class_counts.groupby("Location Classification Description").agg({"Count": "sum"}).reset_index()
loc_class_counts["Proportion"] = loc_class_counts["Count"] / loc_class_counts["Count"].sum()

# Horizontal bar chart, longest bar on top
loc_class_chart = alt.Chart(loc_class_counts).mark_bar().encode(
    x=alt.X("Count:Q", title="Count"),
    y=alt.Y("Location Classification Description:N", title="Location Classification Description", sort="-x"),
    color=alt.Color("Location Classification Description:N", title="Category", legend=alt.Legend(title="Location Classification Description")),
    tooltip=[
        alt.Tooltip("Location Classification Description:N", title="Location Classification Description"),
        alt.Tooltip("Count:Q", title="Count"),
        alt.Tooltip("Proportion:Q", format=".2%", title="Proportion of Total Incidents")
    ]
)

loc_class_chart.properties(
    title="Distribution of Location Classification Descriptions",
    width=800,
    height=400
)
C:\Users\julie\anaconda3\Lib\site-packages\altair\utils\core.py:395: FutureWarning: the convert_dtype parameter is deprecated and will be removed in a future version.  Do ``ser.astype(object).apply()`` instead if you want ``convert_dtype=False``.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
Out[12]:
In [13]:
# Interactive map of incident locations, centered on NYC, with markers
# clustered so ~28k points stay readable at low zoom levels.
shooting_map = folium.Map(location=[40.7128, -74.0060], zoom_start=10)
marker_cluster = MarkerCluster().add_to(shooting_map)

# Only rows with usable coordinates (Latitude/Longitude have some nulls).
# itertuples() is much faster than iterrows() for this many rows and avoids
# iterrows' per-row Series construction.
located = df.dropna(subset=["Latitude", "Longitude"])
for row in located.itertuples(index=False):
    folium.CircleMarker(
        location=[row.Latitude, row.Longitude],
        radius=3,
        color="red",
        fill=True,
        fill_color="red"
    ).add_to(marker_cluster)

# Save a standalone HTML copy, then render the map inline
shooting_map.save("shooting_map.html")
shooting_map
Out[13]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Conclusion¶

Discussion¶

  • I've concluded that, solely through Exploratory Data Analysis, we were able to extract meaningful information about shooting incidents in New York City and draw preliminary conclusions about how, when, where, and who is involved in these shootings — and, most relevant to us, how to avoid being involved in one.
  • My parents and girlfriend helped to evaluate the results of my analysis and were thoroughly impressed with the insights gathered through my Exploratory Data Analysis.
  • The Folium map was found to be the most popular and insightful visualization from this research.
  • I conclude that meaningful and conclusive insights were obtained through the process of Exploratory Data Analysis without the need to do any data modeling.